
##################################################################
##################################################################
## Replication Material
## Tobias Widmann:  Do Politicians Appeal to Discrete Emotions? The Effect of Wind Turbine Construction on Elite Discourse 
## Journal of Politics
## widmann@ps.au.dk
##
## Script 001: Data Preparation
##################################################################
##################################################################

# Note: The file 000_README.pdf describes all scripts and datasets required to replicate the analysis

# This script was run on the following R version, platform and OS:
# R version 4.0.5 (2021-03-31)
# Platform: x86_64-apple-darwin17.0 (64-bit)
# Running under: macOS Big Sur 11.5.1

#### Set Working Directory to the Replication Folder ############################

# Switch into the "files" directory below the current working directory.
# Every relative path used further down ("./...") is resolved against it.
setwd(file.path(".", "files"))


#### PACKAGES ######################################################

install.packages("devtools", version = "2.4.0")
install.packages("groundhog", version = "3.1.1")

library("groundhog")
pckgs <- c("stm", "rgdal", "mapproj")
groundhog.library(pckgs, "2021-05-01")

devtools::install_version("tidyverse", version = "1.3.1")
devtools::install_version("tm", version = "0.7-8")
devtools::install_version("Rtsne", version = "0.15")
devtools::install_version("geometry", version = "0.4.5")
devtools::install_version("rsvd", version = "1.0.5")
devtools::install_version("dplyr", version = "1.0.5")
devtools::install_version("openxlsx", version = "4.2.3")
devtools::install_version("tidytext", version = "0.3.1")
devtools::install_version("quanteda", version = "3.0.0")
devtools::install_version("plm", version = "2.4-1")
devtools::install_version("lmtest", version = "0.9-40")
devtools::install_version("dotwhisker", version = "0.7.4")
devtools::install_version("pbkrtest", version = "0.5.1")
devtools::install_version("car", version = "3.1-2")
devtools::install_version("rstatix", version = "0.7.2")
devtools::install_version("ggpubr", version = "0.4.0")


library(devtools)
library(stm)
library(tidyverse)
library(tm)
library(Rtsne)
library(geometry)
library(rsvd)
library(dplyr)
library(openxlsx)
library(tidytext)
library(quanteda)
library(plm)
library(lmtest)
library(dotwhisker)
library(pbkrtest)
library(car)
library(rstatix)
library(ggpubr)

#### Preparing Parliament Data #############################################


##### Load Data from ParlSpeech ############################################
# Raw parliamentary speech data
# (presumably provides `wind_ps_raw`, which is used below -- confirm)
load(file = "./parl_speech_raw.Rdata")


##### PLZ ###################################################################
# Postal codes together with their electoral districts
load(file = "./postal_codes.Rdata")



##### Windturbines Treatment ################################################
# Wind turbine information
# (presumably provides `wind_data`, which is used below -- confirm)
load(file = "./wind_data.Rdata")

### Binary Treatment Variable
# Basis for the binary treatment: for every electoral district (`wahlkreis`)
# keep only the rows that carry the district's earliest `date`.
# NOTE(review): the result stays grouped by `wahlkreis` (there is no ungroup());
# this is left untouched so that the object written to disk does not change.
wind_data_unique <- filter(group_by(wind_data, wahlkreis), date == min(date))

# Drop the postal code and collapse to distinct rows of the first three columns
wind_data_unique$plz <- NULL
wind_data_unique <- unique(wind_data_unique[, 1:3])

# Store the reduced table next to the other data files
save(wind_data_unique, file = "./wind_data_unique.Rdata")

# Binary treatment indicators.
# A speech is flagged (1) when its district column equals a `wahlkreis` listed in
# `wind_data_unique` and the speech `date` is on or after the `date` stored for
# that district; every other speech keeps the initial value 0.
# Comparisons that evaluate to NA (e.g. a missing district) leave the 0 in place.

# Indicator based on `wkindirekt`
wind_ps_raw$treat_indirect <- 0
# seq_along() instead of 1:length(): the latter would iterate over c(1, 0)
# if `wind_data_unique` were empty.
for (i in seq_along(wind_data_unique$wahlkreis)) {
  is_treated <- wind_ps_raw$wkindirekt == wind_data_unique$wahlkreis[i] &
    wind_ps_raw$date >= wind_data_unique$date[i]
  wind_ps_raw$treat_indirect[is_treated] <- 1
}


# Indicator based on `wkdirekt`; missing districts are recoded to 0 first
wind_ps_raw$wkdirekt[is.na(wind_ps_raw$wkdirekt)] <- 0
wind_ps_raw$treat_direct <- 0
for (i in seq_along(wind_data_unique$wahlkreis)) {
  is_treated <- wind_ps_raw$wkdirekt == wind_data_unique$wahlkreis[i] &
    wind_ps_raw$date >= wind_data_unique$date[i]
  wind_ps_raw$treat_direct[is_treated] <- 1
}


### Count Data
# Assign treatment variables (count)

wind_ps_raw$count <- 0

# `df_cap` holds one running counter (`former_cap`) per electoral district.
df_cap <- wind_data
df_cap$capacity <- df_cap$date <- df_cap$plz <- df_cap$date2 <- df_cap$count <- NULL
df_cap$former_cap <- 0

# Presumably the two remaining leading columns are `wahlkreis` and `former_cap`
# -- confirm against the column layout of `wind_data`.
df_cap <- unique(df_cap[, 1:2])

# Walk through `wind_data` row by row. Each row raises the counter of its
# district by one; all speeches of that district (`wkindirekt`) dated on or after
# the row's `date` receive the new counter value.
# NOTE(review): the outcome depends on the row order of `wind_data`; it equals
# "number of rows up to the speech date" only if rows are sorted by date within
# each district -- confirm.
# seq_along() instead of 1:length() so that an empty `wind_data` skips the loop.
for (i in seq_along(wind_data$wahlkreis)) {
  same_district <- df_cap$wahlkreis == wind_data$wahlkreis[i]

  # Counter value after this row; computed once and used for both assignments
  running_count <- sum(df_cap$former_cap[same_district][1], 1)

  affected <- wind_ps_raw$wkindirekt == wind_data$wahlkreis[i] &
    wind_ps_raw$date >= wind_data$date[i]
  wind_ps_raw$count[affected] <- running_count
  df_cap$former_cap[same_district] <- running_count
}


##### Keyword Approach #############################################################
# Identify speeches dealing with green topics through a keyword search

# Lower-cased copy of the speech text; all matching below runs on this column.
wind_ps_raw$sent.text <- tolower(wind_ps_raw$text)

# Keywords, matched as substrings anywhere in the lower-cased text
keywords2 <- c(
  "eeg", "solar", "windkraft", "windenergie", "windpower", "windanlage", "erneuerbar", "strom",
  "elektri", "kwk", "ökol", "repowering", "emission", "photovoltaik",
  "fotovoltaik", "wärme", "kraftstoff", "klima", "shore", "kohle",
  "erwärmung", "wasserkraft", "wasserenergie", "geothermie", "kraftwerk", "turbine",
  "kernkraft", "kernenergie", "klima", "umwelt", "erderwärmung", "umweltverschmutzung"
)

# Single alternation pattern ("eeg|solar|...") shared by both look-ups
keyword_regex <- paste(keywords2, collapse = "|")

# environment2: 1 if at least one keyword occurs in the speech, otherwise 0
wind_ps_raw$environment2 <- as.numeric(grepl(keyword_regex, wind_ps_raw$sent.text))

# environment_count2: number of keyword matches per speech
wind_ps_raw$environment_count2 <- str_count(wind_ps_raw$sent.text, keyword_regex)

# twoabove7: 1 if a speech has more than 7 keyword matches, otherwise 0
# (a missing count is treated as "not above 7")
wind_ps_raw$twoabove7 <- as.numeric((wind_ps_raw$environment_count2 > 7) %in% TRUE)


##### Topic Model ######################################################################

# Preprocessing: German stemming and stop-word removal, lower-casing, removal of
# numbers, punctuation, non-letter characters and HTML. The full speech data
# frame is carried along as document metadata.
processed_test <- textProcessor(wind_ps_raw$text, metadata=wind_ps_raw, language="german", stem=TRUE, lowercase = TRUE,
                                removestopwords = TRUE, removenumbers = TRUE, removepunctuation = TRUE, onlycharacter = TRUE, striphtml = TRUE)

# Trim the vocabulary with document-frequency thresholds (50 / 5000)
out <- prepDocuments(processed_test$documents, processed_test$vocab, processed_test$meta, lower.thresh = 50, upper.thresh = 5000)

docs <- out$documents
vocab <- out$vocab
meta <- out$meta


# Estimating the topic model.
# K = 0 with spectral initialisation leaves the number of topics to stm (see the
# stm documentation), so the width of `theta` is not fixed by this script.
# The EM algorithm is capped at 10 iterations.
tmFit_ps <- stm(documents = out$documents, vocab = out$vocab,
                K = 0, prevalence = ~ party + treat_indirect,
                max.em.its = 10, data = out$meta, seed = 1111,
                init.type = "Spectral")

# Discover Words that belong to Topics
labelTopics(tmFit_ps)

# Attach the document-topic proportions to the metadata
tdf_ps <- cbind(meta, tmFit_ps$theta)

# Topic columns sit directly behind the metadata columns. Their positions are
# derived from the data instead of being hard-coded as 16:100, which was only
# correct for exactly 15 metadata columns and 85 topics.
n_meta <- ncol(meta)
topic_cols <- n_meta + seq_len(ncol(tmFit_ps$theta))

# Dominant topic per document = index of the largest proportion
tdf_ps$topic <- apply(tdf_ps[, topic_cols, drop = FALSE], 1, which.max)


wind_parl <- tdf_ps

# Give topics 16 and 56 readable column names (formerly positions 31 and 71)
colnames(wind_parl)[n_meta + 16] <- "topic16"
colnames(wind_parl)[n_meta + 56] <- "topic56"
glimpse(wind_parl)
##### Tokenizing ############################################################################

# Bring the data frame to the sentence level: every speech in `text` is split
# into sentences, one row per sentence, stored in the new column `sentence`.
wind_ps_raw_sentences <- wind_parl %>%
  unnest_tokens(sentence, text, token = "sentences")

# Call the sentence column `text` again. It is addressed by name rather than by
# the fixed position 101, so the rename cannot hit a different column if the
# number of metadata or topic columns changes.
names(wind_ps_raw_sentences)[names(wind_ps_raw_sentences) == "sentence"] <- "text"

# Sort by sentence text (descending); the later merge of the transformer results
# relies on this order.
wind_ps_raw_sentences <- wind_ps_raw_sentences %>% arrange(desc(text))

##### Electra Model #################################################################

# Export columns 2 and 101 (101 being the sentence text) as csv so that the
# Transformer Model (Widmann & Wich 2022) can be applied outside of R
electra_ps_raw <- wind_ps_raw_sentences[, c(2, 101)]
write.csv(electra_ps_raw, file = "./electra_ps_raw.csv")


# Read the results produced by the Transformer Model (Widmann & Wich 2022)
electra_ps_infered <- read.csv("./electra_ps_results.csv", fill = TRUE, na.strings = "NA")

# Prepare for merging: drop the index column, then label the eight emotion
# columns (positions 2 to 9)
electra_ps_infered$X <- NULL
colnames(electra_ps_infered)[2:9] <- c(
  "el.anger", "el.fear", "el.disgust", "el.sadness",
  "el.joy", "el.enthusiasm", "el.pride", "el.hope"
)

# Same descending sort by text as the sentence data; keep only the emotion columns
electra_ps_infered <- arrange(electra_ps_infered, desc(text))[, 2:9]

# Merging
# NOTE(review): rows are paired purely by position after both tables were sorted
# by text; this assumes the results file contains exactly the exported sentences.
ws_parl <- cbind(wind_ps_raw_sentences, electra_ps_infered)



##### Apply Moral Dictionary #############################################

# Moral dictionary from Bos & Minihold (2022).
# Note: the dictionary was taken from the Appendix of Bos & Minihold and
# transferred manually into a yml file so it can be used in the quanteda
# environment.
mrd_de_mb <- dictionary(
  file = "./mrd_de_mb.yml",
  format = "YAML"
)

# `dic` is the dictionary object that apply_dic() uses for the look-up
dic <- dictionary(mrd_de_mb)

# Apply a quanteda dictionary to every row of a data frame.
#
# Arguments:
#   data  data frame that corpus() can turn into a corpus
#         (presumably via its `text` column -- confirm).
#   dict  quanteda dictionary used for the look-up. Defaults to the global
#         `dic`, so existing calls apply_dic(data) behave as before.
#
# Returns `data` with the converted look-up result (one count column per
# dictionary key plus the document id) bound on, and two length columns:
#   terms_raw  number of tokens before stop-word removal
#   terms      number of tokens after removing German stop words
apply_dic <- function(data, dict = dic) {
  # Create a corpus from the data frame
  corp <- corpus(data)

  # Tokenize and pre-process (remove punctuation, numbers, and urls)
  toks <- tokens(corp, remove_punct = TRUE, remove_numbers = TRUE, remove_url = TRUE)

  # DFM of all tokens; needed for the raw document length
  terms_dfm <- dfm(toks)

  # Remove German stop words. dfm_remove() replaces dfm(toks, remove = ...),
  # whose `remove` argument is deprecated as of quanteda 3.0.
  emo_dfm <- dfm_remove(terms_dfm, pattern = stopwords("de"))

  # Apply dictionary
  dict_dfm_results <- dfm_lookup(emo_dfm, dict)

  # Convert results back to a data frame and attach them to the input
  results_df <- cbind(data, convert(dict_dfm_results, to = "data.frame"))

  # Document lengths before and after stop-word removal
  results_df$terms_raw <- ntoken(terms_dfm)
  results_df$terms <- ntoken(emo_dfm)

  results_df
}

# Score all sentences with the moral dictionary
ws_parl <- apply_dic(ws_parl)


# Normalised scores: dictionary hits divided by document length (`terms`, the
# token count after stop-word removal).
# NOTE(review): rows with terms == 0 produce NaN/Inf here.
moral_keys <- c(
  care = "CARE", harm = "HARM", fairness = "FAIRNESS", cheating = "CHEATING",
  loyalty = "LOYALTY", betrayal = "BETRAYAL", authority = "AUTHORITY",
  subversion = "SUBVERSION", sanctity = "SANCTITY", degradation = "DEGRADATION",
  general = "GENERAL", positive = "POSITIVE", negative = "NEGATIVE",
  morality = "MORALITY"
)
for (domain in names(moral_keys)) {
  ws_parl[[paste0("mfd.", domain, ".norm")]] <-
    ws_parl[[paste0("mrd_de_mb.de.", moral_keys[[domain]])]] / ws_parl$terms
}


# Purity: mean of the two purity domains (sanctity, degradation)
ws_parl$purity.norm <- with(ws_parl, (mfd.sanctity.norm + mfd.degradation.norm) / 2)
summary(ws_parl$purity.norm)

# Care/harm: mean of care and harm
ws_parl$care_harm.norm <- with(ws_parl, (mfd.care.norm + mfd.harm.norm) / 2)
summary(ws_parl$care_harm.norm)

# Fairness + care: mean over fairness, cheating, care and harm
ws_parl$fairness_care.norm <- with(
  ws_parl,
  (mfd.fairness.norm + mfd.cheating.norm + mfd.care.norm + mfd.harm.norm) / 4
)
summary(ws_parl$fairness_care.norm)


##### Fix Party Names and Order #################################################
# Translate two party labels to English
ws_parl$party <- replace(ws_parl$party, ws_parl$party == "GRUENE", "Greens")
ws_parl$party <- replace(ws_parl$party, ws_parl$party == "PDS/LINKE", "The Left")

# Fix the order of the parties; labels not listed here become NA
party_order <- c("The Left", "Greens", "SPD", "FDP", "CDU/CSU", "AfD")
ws_parl$party <- factor(ws_parl$party, levels = party_order)


##### Save Parliament File #######################################################
# Store the prepared sentence-level data for later use
save(ws_parl, file = "./parl_speech.Rdata")


#### Preparing Twitter Data ######################################################
# Twitter's (or X's) data sharing policy does not allow sharing complete
# datasets of tweets; researchers may share tweet IDs only.
# The raw tweet data needed to build the annotated Twitter data can therefore
# not be provided. Only the prepared, already annotated Twitter data is shared.

# The prepared twitter files are:
load(file = "./wind_tweets.Rdata")
load(file = "./twitter_replies.Rdata")
load(file = "./sz_tweets.Rdata")
load(file = "./spd_tweets.Rdata")
load(file = "./linke_tweets.Rdata")
load(file = "./afd_tweets.Rdata")


